{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# PREPARING MODEL TRAINING DATA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialise relevant packages\n",
    "import pandas as pd\n",
    "import pickle\n",
    "\n",
    "\n",
    "# Text cleaning\n",
    "from html import unescape\n",
    "import re\n",
    "import string\n",
    "import wordsegment as ws\n",
    "import emoji\n",
    "ws.load() # load vocab for word segmentation"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load Raw Datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read both raw corpora into a dict keyed by dataset name.\n",
    "# Davidson 2017: the first CSV column is used as the row index.\n",
    "# Founta 2018: tab-delimited file, column names are supplied explicitly.\n",
    "training_data = {\n",
    "    'davidson2017': pd.read_csv(\n",
    "        './Data/Raw Training Data/davidson2017.csv',\n",
    "        index_col=0,\n",
    "    ),\n",
    "    'founta2018': pd.read_csv(\n",
    "        './Data/Raw Training Data/founta2018.csv',\n",
    "        names=['text', 'label', 'count_label_votes'],\n",
    "        delimiter='\\t',\n",
    "    ),\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Tidy Up Data Format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dataset-specific column harmonisation\n",
    "\n",
    "# Davidson 2017: map its column names onto the shared 'label' / 'text' names\n",
    "training_data['davidson2017'] = training_data['davidson2017'].rename(\n",
    "    columns={\"class\": \"label\", \"tweet\": \"text\"},\n",
    "    errors='ignore',\n",
    ")\n",
    "\n",
    "# Founta 2018: columns were already named 'text' / 'label' on load, nothing to do"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bring every dataset into the shared (id, text, label) layout.\n",
    "for dataset, frame in training_data.items():\n",
    "\n",
    "    # Promote the row index to an 'id' column. Skipped when 'id' already exists,\n",
    "    # so re-running this cell does not append a second, duplicate 'id' column.\n",
    "    if 'id' not in frame.columns:\n",
    "        frame = frame.reset_index().rename(columns={'index': 'id'}, errors='ignore')\n",
    "\n",
    "    # drop unnecessary columns and tidy up column types\n",
    "    # (re-assigning the value of an existing key is safe while iterating)\n",
    "    training_data[dataset] = frame[['id', 'text', 'label']].convert_dtypes()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Perform Basic Text Cleaning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def regex_match_segmentation(match):\n",
    "    \"\"\"Replacement callback for re.sub that word-segments the matched text.\n",
    "\n",
    "    The full match is passed to wordsegment's segment() and the resulting\n",
    "    pieces are returned joined by single spaces.\n",
    "    \"\"\"\n",
    "    matched_text = match.group(0)\n",
    "    pieces = ws.segment(matched_text)\n",
    "    return ' '.join(pieces)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _load_emoji_table():\n",
    "    \"\"\"Return a mapping keyed by emoji characters for the installed `emoji` release.\"\"\"\n",
    "    # emoji >= 2.0 exposes EMOJI_DATA and no longer provides UNICODE_EMOJI\n",
    "    table = getattr(emoji, 'EMOJI_DATA', None)\n",
    "    if table is not None:\n",
    "        return table\n",
    "    table = emoji.UNICODE_EMOJI\n",
    "    # emoji 1.x nests the table under language codes; older releases are flat\n",
    "    return table.get('en', table)\n",
    "\n",
    "\n",
    "# resolved once so clean_text does not repeat the lookup for every tweet\n",
    "_EMOJI_TABLE = _load_emoji_table()\n",
    "\n",
    "\n",
    "# Define function for cleaning text\n",
    "def clean_text(text):\n",
    "    \"\"\"Normalise one raw tweet string.\n",
    "\n",
    "    Steps, in order: decode HTML entities, lowercase, replace @-mentions with\n",
    "    '[USER]', replace anything starting with 'http' with '[URL]', replace each\n",
    "    emoji character with ' [EMOJI] ', strip surrounding whitespace, split\n",
    "    hashtags into words, drop leading '!' characters and turn newlines and\n",
    "    tabs into spaces.\n",
    "\n",
    "    Args:\n",
    "        text: the raw tweet as a str.\n",
    "\n",
    "    Returns:\n",
    "        The cleaned str.\n",
    "    \"\"\"\n",
    "\n",
    "    # convert HTML codes\n",
    "    text = unescape(text)\n",
    "\n",
    "    # lowercase text (done before inserting the upper-case special tokens)\n",
    "    text = text.lower()\n",
    "\n",
    "    # replace mentions, URLs and emojis with special token\n",
    "    text = re.sub(r\"@[A-Za-z0-9_-]+\",'[USER]',text)\n",
    "    text = re.sub(r\"http\\S+\",'[URL]',text)\n",
    "    # emoji are matched one code point at a time\n",
    "    text = ''.join(' [EMOJI] ' if (char in _EMOJI_TABLE) else char for char in text).strip()\n",
    "\n",
    "    # find and split hashtags into words\n",
    "    text = re.sub(r\"#[A-Za-z0-9]+\", regex_match_segmentation, text)\n",
    "\n",
    "    # remove punctuation at beginning of string (quirk in Davidson data)\n",
    "    text = text.lstrip(\"!\")\n",
    "\n",
    "    # remove newline and tab characters\n",
    "    text = text.replace('\\n',' ')\n",
    "    text = text.replace('\\t',' ')\n",
    "\n",
    "    return text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# run clean_text over the 'text' column of every dataset\n",
    "for frame in training_data.values():\n",
    "    frame['text'] = frame['text'].apply(clean_text)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Export Multiclass Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "davidson2017\n",
      "label\n",
      "hateful       1430\n",
      "neither       4163\n",
      "offensive    19190\n",
      "Name: id, dtype: int64 \n",
      "\n",
      "founta2018\n",
      "label\n",
      "abusive    27150\n",
      "hateful     4965\n",
      "normal     53851\n",
      "spam       14030\n",
      "Name: id, dtype: int64 \n",
      "\n"
     ]
    }
   ],
   "source": [
    "# give multiclass labels string names for clarity\n",
    "# Davidson et al. (2017) --> 0 is \"hate speech\", 1 is \"offensive language\", 2 is \"neither\"\n",
    "# The result is assigned back to the column: calling replace(inplace=True) on a\n",
    "# column accessed through the frame does not update the frame under pandas Copy-on-Write.\n",
    "training_data['davidson2017']['label'] = training_data['davidson2017']['label'].replace(\n",
    "    {0: \"hateful\", 1: \"offensive\", 2: \"neither\"}\n",
    ")\n",
    "\n",
    "# print class frequencies for each dataset\n",
    "for dataset in training_data:\n",
    "    print(dataset)\n",
    "    print(training_data[dataset].groupby('label').id.count(), '\\n')\n",
    "\n",
    "# save dictionary of cleaned datasets to pickle; the with-block closes the file handle\n",
    "with open('./Data/Clean Training Data/training_data_multiclass.pkl','wb') as pickle_file:\n",
    "    pickle.dump(training_data, pickle_file)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Convert to Binary Classification Task"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# GOAL: hateful (1) and non-hateful (0)\n",
    "# Results are assigned back to the column: calling replace(inplace=True) on a\n",
    "# column accessed through the frame does not update the frame under pandas Copy-on-Write.\n",
    "\n",
    "# Davidson et al. (2017) --> \"hateful\", \"offensive\", \"neither\"\n",
    "training_data['davidson2017']['label'] = training_data['davidson2017']['label'].replace(\n",
    "    {'hateful': 1, 'offensive': 0, 'neither': 0}\n",
    ")\n",
    "\n",
    "# Founta et al. (2018) --> \"hateful\", \"abusive\", \"normal\", \"spam\"\n",
    "training_data['founta2018']['label'] = training_data['founta2018']['label'].replace(\n",
    "    {'hateful': 1, \"abusive\": 0, \"normal\": 0, \"spam\": 0}\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Export Binary Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "davidson2017\n",
      "label\n",
      "0    23353\n",
      "1     1430\n",
      "Name: id, dtype: int64 \n",
      "\n",
      "founta2018\n",
      "label\n",
      "0    95031\n",
      "1     4965\n",
      "Name: id, dtype: int64 \n",
      "\n"
     ]
    }
   ],
   "source": [
    "# print class frequencies for each dataset\n",
    "for dataset in training_data:\n",
    "    print(dataset)\n",
    "    print(training_data[dataset].groupby('label').id.count(), '\\n')\n",
    "\n",
    "# save dictionary of cleaned datasets to pickle; the with-block closes the file handle\n",
    "with open('./Data/Clean Training Data/training_data_binary.pkl','wb') as pickle_file:\n",
    "    pickle.dump(training_data, pickle_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
